import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.datasets import load_digits, fetch_california_housing
# Load the digits dataset into a DataFrame: one column per feature, named
# pixel_<i>, plus the class label in 'target'.
digits = load_digits()
digits_df = pd.DataFrame(
    data=digits.data,
    columns=[f'pixel_{i}' for i in range(digits.data.shape[1])],
)
digits_df['target'] = digits.target
# Load the California housing dataset; the regression target is stored next to
# the features under 'MedHouseVal'.
housing = fetch_california_housing()
housing_df = pd.DataFrame(data=housing.data, columns=housing.feature_names)
housing_df['MedHouseVal'] = housing.target
# EDA for California housing dataset
# DataFrame.info() writes its report itself and returns None, so it is called
# directly -- wrapping it in print() emits a stray "None" line after the report.
housing_df.info()
print("\n")
print(housing_df.head())
print("\n")
print(housing_df.describe())
print("\n")
sns.histplot(housing_df['MedHouseVal'], bins=30)
plt.title('Distribution of Median House Values')
plt.show()
# EDA for digits dataset
# Same as above: info() is called bare to avoid printing its None return value.
digits_df.info()
print("\n")
print(digits_df.describe())
print("\n")
# One bar per digit class, to check how evenly the classes are represented.
sns.countplot(x='target', data=digits_df)
plt.title('Distribution of Digits')
plt.show()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 20640 entries, 0 to 20639 Data columns (total 9 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 MedInc 20640 non-null float64 1 HouseAge 20640 non-null float64 2 AveRooms 20640 non-null float64 3 AveBedrms 20640 non-null float64 4 Population 20640 non-null float64 5 AveOccup 20640 non-null float64 6 Latitude 20640 non-null float64 7 Longitude 20640 non-null float64 8 MedHouseVal 20640 non-null float64 dtypes: float64(9) memory usage: 1.4 MB None MedInc HouseAge AveRooms AveBedrms Population AveOccup Latitude \ 0 8.3252 41.0 6.984127 1.023810 322.0 2.555556 37.88 1 8.3014 21.0 6.238137 0.971880 2401.0 2.109842 37.86 2 7.2574 52.0 8.288136 1.073446 496.0 2.802260 37.85 3 5.6431 52.0 5.817352 1.073059 558.0 2.547945 37.85 4 3.8462 52.0 6.281853 1.081081 565.0 2.181467 37.85 Longitude MedHouseVal 0 -122.23 4.526 1 -122.22 3.585 2 -122.24 3.521 3 -122.25 3.413 4 -122.25 3.422 MedInc HouseAge AveRooms AveBedrms Population \ count 20640.000000 20640.000000 20640.000000 20640.000000 20640.000000 mean 3.870671 28.639486 5.429000 1.096675 1425.476744 std 1.899822 12.585558 2.474173 0.473911 1132.462122 min 0.499900 1.000000 0.846154 0.333333 3.000000 25% 2.563400 18.000000 4.440716 1.006079 787.000000 50% 3.534800 29.000000 5.229129 1.048780 1166.000000 75% 4.743250 37.000000 6.052381 1.099526 1725.000000 max 15.000100 52.000000 141.909091 34.066667 35682.000000 AveOccup Latitude Longitude MedHouseVal count 20640.000000 20640.000000 20640.000000 20640.000000 mean 3.070655 35.631861 -119.569704 2.068558 std 10.386050 2.135952 2.003532 1.153956 min 0.692308 32.540000 -124.350000 0.149990 25% 2.429741 33.930000 -121.800000 1.196000 50% 2.818116 34.260000 -118.490000 1.797000 75% 3.282261 37.710000 -118.010000 2.647250 max 1243.333333 41.950000 -114.310000 5.000010
<class 'pandas.core.frame.DataFrame'> RangeIndex: 1797 entries, 0 to 1796 Data columns (total 65 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 pixel_0 1797 non-null float64 1 pixel_1 1797 non-null float64 2 pixel_2 1797 non-null float64 3 pixel_3 1797 non-null float64 4 pixel_4 1797 non-null float64 5 pixel_5 1797 non-null float64 6 pixel_6 1797 non-null float64 7 pixel_7 1797 non-null float64 8 pixel_8 1797 non-null float64 9 pixel_9 1797 non-null float64 10 pixel_10 1797 non-null float64 11 pixel_11 1797 non-null float64 12 pixel_12 1797 non-null float64 13 pixel_13 1797 non-null float64 14 pixel_14 1797 non-null float64 15 pixel_15 1797 non-null float64 16 pixel_16 1797 non-null float64 17 pixel_17 1797 non-null float64 18 pixel_18 1797 non-null float64 19 pixel_19 1797 non-null float64 20 pixel_20 1797 non-null float64 21 pixel_21 1797 non-null float64 22 pixel_22 1797 non-null float64 23 pixel_23 1797 non-null float64 24 pixel_24 1797 non-null float64 25 pixel_25 1797 non-null float64 26 pixel_26 1797 non-null float64 27 pixel_27 1797 non-null float64 28 pixel_28 1797 non-null float64 29 pixel_29 1797 non-null float64 30 pixel_30 1797 non-null float64 31 pixel_31 1797 non-null float64 32 pixel_32 1797 non-null float64 33 pixel_33 1797 non-null float64 34 pixel_34 1797 non-null float64 35 pixel_35 1797 non-null float64 36 pixel_36 1797 non-null float64 37 pixel_37 1797 non-null float64 38 pixel_38 1797 non-null float64 39 pixel_39 1797 non-null float64 40 pixel_40 1797 non-null float64 41 pixel_41 1797 non-null float64 42 pixel_42 1797 non-null float64 43 pixel_43 1797 non-null float64 44 pixel_44 1797 non-null float64 45 pixel_45 1797 non-null float64 46 pixel_46 1797 non-null float64 47 pixel_47 1797 non-null float64 48 pixel_48 1797 non-null float64 49 pixel_49 1797 non-null float64 50 pixel_50 1797 non-null float64 51 pixel_51 1797 non-null float64 52 pixel_52 1797 non-null float64 53 pixel_53 1797 non-null float64 54 pixel_54 
1797 non-null float64 55 pixel_55 1797 non-null float64 56 pixel_56 1797 non-null float64 57 pixel_57 1797 non-null float64 58 pixel_58 1797 non-null float64 59 pixel_59 1797 non-null float64 60 pixel_60 1797 non-null float64 61 pixel_61 1797 non-null float64 62 pixel_62 1797 non-null float64 63 pixel_63 1797 non-null float64 64 target 1797 non-null int64 dtypes: float64(64), int64(1) memory usage: 912.7 KB None pixel_0 pixel_1 pixel_2 pixel_3 pixel_4 \ count 1797.0 1797.000000 1797.000000 1797.000000 1797.000000 mean 0.0 0.303840 5.204786 11.835838 11.848080 std 0.0 0.907192 4.754826 4.248842 4.287388 min 0.0 0.000000 0.000000 0.000000 0.000000 25% 0.0 0.000000 1.000000 10.000000 10.000000 50% 0.0 0.000000 4.000000 13.000000 13.000000 75% 0.0 0.000000 9.000000 15.000000 15.000000 max 0.0 8.000000 16.000000 16.000000 16.000000 pixel_5 pixel_6 pixel_7 pixel_8 pixel_9 ... \ count 1797.000000 1797.000000 1797.000000 1797.000000 1797.000000 ... mean 5.781859 1.362270 0.129661 0.005565 1.993879 ... std 5.666418 3.325775 1.037383 0.094222 3.196160 ... min 0.000000 0.000000 0.000000 0.000000 0.000000 ... 25% 0.000000 0.000000 0.000000 0.000000 0.000000 ... 50% 4.000000 0.000000 0.000000 0.000000 0.000000 ... 75% 11.000000 0.000000 0.000000 0.000000 3.000000 ... max 16.000000 16.000000 15.000000 2.000000 16.000000 ... 
pixel_55 pixel_56 pixel_57 pixel_58 pixel_59 \ count 1797.000000 1797.000000 1797.000000 1797.000000 1797.000000 mean 0.206455 0.000556 0.279354 5.557596 12.089037 std 0.984401 0.023590 0.934302 5.103019 4.374694 min 0.000000 0.000000 0.000000 0.000000 0.000000 25% 0.000000 0.000000 0.000000 1.000000 11.000000 50% 0.000000 0.000000 0.000000 4.000000 13.000000 75% 0.000000 0.000000 0.000000 10.000000 16.000000 max 13.000000 1.000000 9.000000 16.000000 16.000000 pixel_60 pixel_61 pixel_62 pixel_63 target count 1797.000000 1797.000000 1797.000000 1797.000000 1797.000000 mean 11.809126 6.764051 2.067891 0.364496 4.490818 std 4.933947 5.900623 4.090548 1.860122 2.865304 min 0.000000 0.000000 0.000000 0.000000 0.000000 25% 10.000000 0.000000 0.000000 0.000000 2.000000 50% 14.000000 6.000000 0.000000 0.000000 4.000000 75% 16.000000 12.000000 2.000000 0.000000 7.000000 max 16.000000 16.000000 16.000000 16.000000 9.000000 [8 rows x 65 columns]
from sklearn import datasets
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix, accuracy_score, ConfusionMatrixDisplay
import matplotlib.pyplot as plt
# Load the digits dataset
digits = datasets.load_digits()
X = digits.data
y = digits.target
# Split the dataset into training and testing sets (20% held out, fixed seed)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
# Train Decision Tree Classifier
# random_state is fixed on the estimators as well as on the split, so reruns
# give the same accuracies, confusion matrices and feature importances.
dt_classifier = DecisionTreeClassifier(random_state=42)
dt_classifier.fit(X_train, y_train)
y_pred_dt = dt_classifier.predict(X_test)
# Train Random Forest Classifier
rf_classifier = RandomForestClassifier(random_state=42)
rf_classifier.fit(X_train, y_train)
y_pred_rf = rf_classifier.predict(X_test)
# Calculate accuracy on the held-out test set
dt_accuracy = accuracy_score(y_test, y_pred_dt)
rf_accuracy = accuracy_score(y_test, y_pred_rf)
# Print accuracy scores
print(f"Decision Tree Accuracy: {dt_accuracy * 100:.2f}%")
print(f"Random Forest Accuracy: {rf_accuracy * 100:.2f}%")
print("\n")
# Plot confusion matrices side by side (left: decision tree, right: forest)
fig, ax = plt.subplots(1, 2, figsize=(12, 5))
# Decision Tree Confusion Matrix
dt_cm = confusion_matrix(y_test, y_pred_dt)
ConfusionMatrixDisplay(dt_cm).plot(ax=ax[0], cmap=plt.cm.Blues)
ax[0].set_title('Decision Tree Confusion Matrix')
# Random Forest Confusion Matrix
rf_cm = confusion_matrix(y_test, y_pred_rf)
ConfusionMatrixDisplay(rf_cm).plot(ax=ax[1], cmap=plt.cm.Blues)
ax[1].set_title('Random Forest Confusion Matrix')
plt.show()
Decision Tree Accuracy: 85.00% Random Forest Accuracy: 97.50%
from sklearn import datasets
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix, accuracy_score, ConfusionMatrixDisplay
import matplotlib.pyplot as plt
# Load the digits dataset
digits = datasets.load_digits()
X = digits.data
y = digits.target
# Split the dataset into training and testing sets (20% held out, fixed seed)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
# Tuning Decision Tree
# Exhaustive search over every max_depth / min_samples_split combination,
# scored with 5-fold cross-validation on the training split only.
dt_param_grid = {
    'max_depth': [None, 5, 10, 20],
    'min_samples_split': [2, 5, 10],
}
# The base estimator is seeded so the selected hyper-parameters and the
# resulting scores are the same on every run.
dt_grid_search = GridSearchCV(DecisionTreeClassifier(random_state=42), dt_param_grid, cv=5)
dt_grid_search.fit(X_train, y_train)
# predict() on the fitted search object; predictions feed the test metrics below.
y_pred_dt_tuned = dt_grid_search.predict(X_test)
# Tuning Random Forest
rf_param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [None, 5, 10, 20],
}
rf_grid_search = GridSearchCV(RandomForestClassifier(random_state=42), rf_param_grid, cv=5)
rf_grid_search.fit(X_train, y_train)
y_pred_rf_tuned = rf_grid_search.predict(X_test)
# Calculate accuracy for tuned models
dt_tuned_accuracy = accuracy_score(y_test, y_pred_dt_tuned)
rf_tuned_accuracy = accuracy_score(y_test, y_pred_rf_tuned)
# Confusion matrices are computed once and reused for both the text dump and
# the plots below.
dt_cm = confusion_matrix(y_test, y_pred_dt_tuned)
rf_cm = confusion_matrix(y_test, y_pred_rf_tuned)
# Print accuracy scores
print(f"Tuned Decision Tree Accuracy: {dt_tuned_accuracy * 100:.2f}%")
print(f"Tuned Random Forest Accuracy: {rf_tuned_accuracy * 100:.2f}%")
print("\n")
# Print Confusion Matrices
print("Tuned Decision Tree Confusion Matrix:")
print(dt_cm)
print("\n")
print("Tuned Random Forest Confusion Matrix:")
print(rf_cm)
# Plot confusion matrices side by side (left: decision tree, right: forest)
fig, ax = plt.subplots(1, 2, figsize=(12, 5))
# Decision Tree Confusion Matrix
ConfusionMatrixDisplay(dt_cm).plot(ax=ax[0], cmap=plt.cm.Blues)
ax[0].set_title('Tuned Decision Tree Confusion Matrix')
# Random Forest Confusion Matrix
ConfusionMatrixDisplay(rf_cm).plot(ax=ax[1], cmap=plt.cm.Blues)
ax[1].set_title('Tuned Random Forest Confusion Matrix')
plt.show()
Tuned Decision Tree Accuracy: 86.39% Tuned Random Forest Accuracy: 97.50% Tuned Decision Tree Confusion Matrix: [[29 0 0 0 2 1 0 0 0 1] [ 0 21 1 0 3 0 0 1 1 1] [ 0 1 26 2 2 0 0 1 1 0] [ 0 0 0 30 0 0 1 0 1 2] [ 0 2 0 0 41 0 0 3 0 0] [ 0 0 0 1 1 45 0 0 0 0] [ 0 0 0 0 1 0 34 0 0 0] [ 0 0 0 2 2 0 0 30 0 0] [ 1 3 0 1 2 1 0 0 20 2] [ 0 0 0 3 0 0 0 2 0 35]] Tuned Random Forest Confusion Matrix: [[32 0 0 0 1 0 0 0 0 0] [ 0 28 0 0 0 0 0 0 0 0] [ 0 0 33 0 0 0 0 0 0 0] [ 0 0 0 33 0 1 0 0 0 0] [ 0 0 0 0 46 0 0 0 0 0] [ 0 0 0 0 0 45 1 0 0 1] [ 0 0 0 0 0 1 34 0 0 0] [ 0 0 0 0 0 0 0 33 0 1] [ 0 1 0 0 0 0 0 0 29 0] [ 0 0 0 0 0 1 0 1 0 38]]
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score
# Split the dataset (20% held out, fixed seed)
X_housing = housing.data
y_housing = housing.target
X_train_housing, X_test_housing, y_train_housing, y_test_housing = train_test_split(
    X_housing, y_housing, test_size=0.2, random_state=42
)
# Decision Tree Regressor
# random_state is fixed on the regressors as well as on the split, so the
# reported MSE / R^2 and the feature importances are the same on every run.
dt_regressor = DecisionTreeRegressor(random_state=42)
dt_regressor.fit(X_train_housing, y_train_housing)
y_pred_dt_reg = dt_regressor.predict(X_test_housing)
# Random Forest Regressor
rf_regressor = RandomForestRegressor(random_state=42)
rf_regressor.fit(X_train_housing, y_train_housing)
y_pred_rf_reg = rf_regressor.predict(X_test_housing)
# Metrics, all computed on the held-out test split
print("Decision Tree Regression MSE:", mean_squared_error(y_test_housing, y_pred_dt_reg))
print("Decision Tree R^2:", r2_score(y_test_housing, y_pred_dt_reg))
print("Random Forest Regression MSE:", mean_squared_error(y_test_housing, y_pred_rf_reg))
print("Random Forest R^2:", r2_score(y_test_housing, y_pred_rf_reg))
Decision Tree Regression MSE: 0.4950806416996609 Decision Tree R^2: 0.6221937960435375 Random Forest Regression MSE: 0.2540368314402719 Random Forest R^2: 0.8061392773870548
from sklearn import datasets
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix, accuracy_score, ConfusionMatrixDisplay
import matplotlib.pyplot as plt
import numpy as np
# Load the digits dataset
digits = datasets.load_digits()
X = digits.data
y = digits.target
# Split the dataset into training and testing sets (20% held out, fixed seed)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
# Tuning Decision Tree using RandomizedSearchCV
# n_iter=3 candidates are sampled from the lists below and scored with 3-fold
# cross-validation on the training split.
dt_param_dist = {
    'max_depth': [None, 5, 10],
    'min_samples_split': [2, 5],
}
# The search was already seeded; the wrapped estimator is seeded too so that
# the candidate scores (not just the candidate sampling) repeat across runs.
dt_random_search = RandomizedSearchCV(
    DecisionTreeClassifier(random_state=42),
    dt_param_dist,
    n_iter=3,
    cv=3,
    random_state=42,
)
dt_random_search.fit(X_train, y_train)
y_pred_dt_tuned = dt_random_search.predict(X_test)
# Tuning Random Forest using RandomizedSearchCV
rf_param_dist = {
    'n_estimators': [50, 100],
    'max_depth': [None, 5, 10],
}
rf_random_search = RandomizedSearchCV(
    RandomForestClassifier(random_state=42),
    rf_param_dist,
    n_iter=3,
    cv=3,
    random_state=42,
)
rf_random_search.fit(X_train, y_train)
y_pred_rf_tuned = rf_random_search.predict(X_test)
# Calculate accuracy for tuned models
dt_tuned_accuracy = accuracy_score(y_test, y_pred_dt_tuned)
rf_tuned_accuracy = accuracy_score(y_test, y_pred_rf_tuned)
# Confusion matrices are computed once and reused for both the text dump and
# the plots below.
dt_cm = confusion_matrix(y_test, y_pred_dt_tuned)
rf_cm = confusion_matrix(y_test, y_pred_rf_tuned)
# Print accuracy scores
print(f"Tuned Decision Tree Accuracy: {dt_tuned_accuracy * 100:.2f}%")
print(f"Tuned Random Forest Accuracy: {rf_tuned_accuracy * 100:.2f}%")
# Print Confusion Matrices
print("Tuned Decision Tree Confusion Matrix:")
print(dt_cm)
print("Tuned Random Forest Confusion Matrix:")
print(rf_cm)
# Plot confusion matrices side by side (left: decision tree, right: forest)
fig, ax = plt.subplots(1, 2, figsize=(12, 5))
# Decision Tree Confusion Matrix
ConfusionMatrixDisplay(dt_cm).plot(ax=ax[0], cmap=plt.cm.Blues)
ax[0].set_title('Tuned Decision Tree Confusion Matrix')
# Random Forest Confusion Matrix
ConfusionMatrixDisplay(rf_cm).plot(ax=ax[1], cmap=plt.cm.Blues)
ax[1].set_title('Tuned Random Forest Confusion Matrix')
plt.show()
Tuned Decision Tree Accuracy: 83.89% Tuned Random Forest Accuracy: 98.33% Tuned Decision Tree Confusion Matrix: [[29 0 1 0 1 1 0 0 0 1] [ 0 22 1 0 1 0 0 1 2 1] [ 1 0 24 3 2 0 1 1 1 0] [ 0 0 1 30 1 0 0 0 1 1] [ 0 0 1 0 40 1 2 1 1 0] [ 0 0 4 0 1 40 1 0 1 0] [ 0 0 0 0 3 0 32 0 0 0] [ 0 0 0 1 2 0 0 30 0 1] [ 0 2 1 2 0 1 0 0 21 3] [ 0 0 0 2 1 0 0 2 1 34]] Tuned Random Forest Confusion Matrix: [[32 0 0 0 1 0 0 0 0 0] [ 0 28 0 0 0 0 0 0 0 0] [ 0 0 33 0 0 0 0 0 0 0] [ 0 0 0 33 0 1 0 0 0 0] [ 0 0 0 0 46 0 0 0 0 0] [ 0 0 0 0 0 47 0 0 0 0] [ 0 0 0 0 0 1 34 0 0 0] [ 0 0 0 0 0 0 0 33 0 1] [ 0 1 0 0 0 0 0 0 29 0] [ 0 0 0 0 0 0 0 1 0 39]]
# Feature importance report: the two digit classifiers first, then the two
# housing regressors. Each entry pairs the heading to print with the fitted
# model whose feature_importances_ array follows it.
importance_sections = (
    ("Decision Tree Feature Importance:", dt_classifier),
    ("Random Forest Feature Importance:", rf_classifier),
    ("Decision Tree Regressor Feature Importance:", dt_regressor),
    ("Random Forest Regressor Feature Importance:", rf_regressor),
)
final_section = len(importance_sections) - 1
for section_index, (heading, fitted_model) in enumerate(importance_sections):
    print(heading)
    print(fitted_model.feature_importances_)
    # Sections are separated by a blank gap; nothing is printed after the last.
    if section_index != final_section:
        print("\n")
Decision Tree Feature Importance: [0. 0.01036013 0.00451836 0.00569966 0.00243391 0.05963066 0. 0. 0. 0.00103113 0.03424813 0.00077334 0.0116301 0.00564541 0.00292152 0. 0.00153196 0. 0.01859643 0.01277246 0.05068993 0.10083092 0.00123735 0. 0.00151764 0.00077334 0.07175765 0.06311472 0.00736352 0.00434546 0.0091637 0. 0. 0.05878518 0.00221174 0.00625672 0.07837266 0.01672474 0.00365746 0. 0. 0.00269306 0.13063307 0.04837164 0.00077334 0. 0.01559676 0. 0. 0.0021267 0.00709405 0.00858535 0.0007347 0.00123735 0.01724102 0.00152988 0. 0. 0.00588908 0.00077334 0.06531677 0.03235316 0.00144358 0.00901124] Random Forest Feature Importance: [0.00000000e+00 2.70631473e-03 2.14913233e-02 1.05092028e-02 1.00978014e-02 1.64898689e-02 8.04677766e-03 8.26333795e-04 6.37017762e-05 1.02515402e-02 2.73231438e-02 5.72023318e-03 1.54534256e-02 2.61916936e-02 4.56128411e-03 5.37956710e-04 7.06270618e-05 7.64725651e-03 2.13004239e-02 2.58066462e-02 3.12641697e-02 4.74594251e-02 8.83202224e-03 6.92691968e-04 1.45774767e-05 1.55011221e-02 4.10820139e-02 2.76058306e-02 3.49759665e-02 2.16279796e-02 3.65400138e-02 4.24236622e-05 0.00000000e+00 3.38336410e-02 2.45128606e-02 2.00785391e-02 3.60605647e-02 1.89641527e-02 2.30882573e-02 0.00000000e+00 2.26227020e-05 1.08058930e-02 3.24222740e-02 4.60223811e-02 2.00973620e-02 2.07301330e-02 1.48531087e-02 8.16271063e-05 2.87159687e-05 3.05501233e-03 1.62098922e-02 2.02596358e-02 1.30036464e-02 2.28898713e-02 2.81774465e-02 1.99790301e-03 0.00000000e+00 2.28124933e-03 2.16805731e-02 9.25950631e-03 3.22491832e-02 2.59498824e-02 1.63248297e-02 4.32544363e-03] Decision Tree Regressor Feature Importance: [0.52747092 0.05184666 0.05279608 0.02815275 0.03082368 0.13166008 0.09266352 0.08458631] Random Forest Regressor Feature Importance: [0.52525632 0.05456008 0.04325463 0.02928013 0.03073374 0.13886681 0.08872977 0.08931852]